import pandas as pd
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# Run configuration: `c` is substituted into the input file name, and `verbs`
# controls whether verbs are added to the entity tokens later in the script.
c = 1000
verbs = True

source_path = '../data/gpo_final_data/narratives_complete_with_metadata_manual_labels_rich_{0}.csv'.format(c)
df = pd.read_csv(source_path)

# Build a speaker-session key from the Wikipedia ID and the Congress number.
# Rows with a missing key (not matched to a Congress member) are dropped, and
# the remaining keys are reduced to lowercase alphanumerics.
df['id'] = df['wikipedia_id'] + '_' + df['congress'].astype('str')
df = df.dropna(subset=['id'])
non_alphanumeric = re.compile('[^A-Za-z0-9]+')
df['id'] = [non_alphanumeric.sub('', raw_id).lower() for raw_id in df['id']]

# Keep only Republicans and Democrats (drops independents and any other value)
df = df[df['party'].isin(['Republican', 'Democrat'])]

# Attach each state's region from the metadata table and keep only the rows
# whose state was found there.
gst = pd.read_csv('../data/metadata/gst_metadata_ecm_2019.csv')
for column in ('state', 'region'):
    gst[column] = gst[column].str.lower()
# One row per state, so the merge below cannot multiply rows of df.
gst = gst.drop_duplicates(subset=['state'])

df = df.merge(gst[['state', 'region']], on=['state'], how='left', indicator=True)
# With indicator=True the `_merge` column is 'both' only for matched states.
has_region = df['_merge'] == 'both'
df = df[has_region]

# Collapse each multi-word entity into a single token by removing its spaces.
# NOTE(review): the first column is spelled 'ARGO' (letter O) -- confirm this
# matches the CSV header rather than 'ARG0'.
for role in ('ARGO', 'ARG1'):
    df[role] = [text.replace(' ', '') for text in df[role]]

entity_columns = ['ARGO', 'ARG1']
if verbs == True:
    # Verbs lose their hyphens as well as their spaces.
    df['B-V-CLEANED'] = [
        text.replace(' ', '').replace('-', '') for text in df['B-V-CLEANED']
    ]
    entity_columns.append('B-V-CLEANED')

# Row-wise, space-separated concatenation of the selected columns.
combined = df[entity_columns[0]]
for column_name in entity_columns[1:]:
    combined = combined + ' ' + df[column_name]
df['entities'] = combined

# Strip punctuation (e.g. F.B.I. -> FBI) so CountVectorizer keeps such
# entities as one token.
entity_punctuation = re.compile('[^A-Za-z0-9 _]+')
df['entities'] = [entity_punctuation.sub('', text) for text in df['entities']]

# One space-joined entity document per speaker-session ID
df_ent = df.groupby('id')['entities'].apply(' '.join).reset_index()

# Turn every narrative into a single CountVectorizer token: strip punctuation
# (e.g. F.B.I. -> FBI), then replace the spaces with the placeholder 'XXX' so
# the whole narrative survives tokenization as one word.
narrative_punctuation = re.compile('[^A-Za-z0-9 _]+')
df['narrative'] = [
    narrative_punctuation.sub('', text).replace(' ', 'XXX')
    for text in df['narrative']
]

# One space-joined narrative document per speaker-session ID
df_narr = df.groupby('id')['narrative'].apply(' '.join).reset_index()

# Get document-term-matrix for entities: one row per speaker-session ID, one
# column per vocabulary term, cell = number of occurrences.
model = CountVectorizer(ngram_range=(1, 1))
doc_term = model.fit_transform(df_ent.entities)
model_vocab = model.vocabulary_
# vocabulary_ maps term -> column index, and the dict's key order is NOT the
# column order. Sort the terms by their index so that every column of the
# matrix is labelled with the term it actually counts.
model_vocab_keys = sorted(model_vocab, key=model_vocab.get)
print('Size of model vocabulary:', len(model_vocab))
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
doc_term = doc_term.toarray()
df_dtm = pd.DataFrame(doc_term, columns=model_vocab_keys)
# Both frames carry a default 0..n-1 index, so the join is positional.
df_ent = df_ent[['id']].join(df_dtm)
if verbs:
    df_ent.to_csv('../data/temp/gst_dtm_entities_verbs.csv', index=False)
else:
    df_ent.to_csv('../data/temp/gst_dtm_entities.csv', index=False)

# Get document-term-matrix for narratives: one row per speaker-session ID, one
# column per distinct narrative, cell = number of occurrences.
model = CountVectorizer(ngram_range=(1, 1))
doc_term = model.fit_transform(df_narr.narrative)
model_vocab = model.vocabulary_
# vocabulary_ maps term -> column index, and the dict's key order is NOT the
# column order. Sort the terms by their index so that every column of the
# matrix is labelled with the narrative it actually counts.
model_vocab_keys = sorted(model_vocab, key=model_vocab.get)
print('Size of model vocabulary:', len(model_vocab))
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix)
doc_term = doc_term.toarray()
# Restore word boundaries: the 'XXX' space placeholder shows up lowercased
# because CountVectorizer lowercases its input by default.
model_vocab_keys = [term.replace('xxx', '_') for term in model_vocab_keys]
df_dtm = pd.DataFrame(doc_term, columns=model_vocab_keys)
# Both frames carry a default 0..n-1 index, so the join is positional.
df_narr = df_narr[['id']].join(df_dtm)
df_narr.to_csv('../data/temp/gst_dtm_narratives.csv', index=False)

# Generate speaker metadata: keep the first row seen for each speaker-session
# ID and recode party, gender and chamber as 0/1 indicator columns.
df = df.drop_duplicates(subset=['id'])
df = df.rename(columns={'congress': 'session'})

is_republican = df['party'] == 'Republican'
is_female = df['gender'] == 'F'
is_senate = df['chamber'] == 'senate'

df['republican'] = np.where(is_republican, 1, 0)
df['gender'] = np.where(is_female, 1, 0)
df['chamber'] = np.where(is_senate, 1, 0)

metadata_columns = ['id', 'republican', 'session', 'state', 'gender', 'chamber', 'region']
df = df[metadata_columns]
df.to_csv('../data/temp/gst_speaker_metadata.csv', index=False)
